{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "f4b72fd7",
   "metadata": {},
   "source": [
    "## Description:\n",
    "这个是sharedBottom模型的demo, 尝试在中级API的基础上，加一些loss优化的思路， 这次是DTP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "324f6ca1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:root:\n",
      "DeepCTR version 0.9.0 detected. Your version is 0.8.2.\n",
      "Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From C:\\Users\\ZHONGQ~1\\AppData\\Local\\Temp/ipykernel_106640/532728279.py:27: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.\n",
      "\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From C:\\Users\\ZHONGQ~1\\AppData\\Local\\Temp/ipykernel_106640/532728279.py:27: The name tf.keras.backend.set_session is deprecated. Please use tf.compat.v1.keras.backend.set_session instead.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import math\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "import random\n",
    "\n",
    "from tqdm import tqdm\n",
    "\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error\n",
    "from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat\n",
    "from deepctr.feature_column import get_feature_names\n",
    "from SharedBottom import SharedBottom\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow.keras import backend as K\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "config = tf.compat.v1.ConfigProto()\n",
    "config.gpu_options.allow_growth = True      # TensorFlow按需分配显存\n",
    "config.gpu_options.per_process_gpu_memory_fraction = 0.5  # 指定显存分配比例\n",
    "tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))\n",
    "\n",
    "# tf.config.experimental_run_functions_eagerly(True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "577efc09",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = '../data_process'\n",
    "data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "88a84ee4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选择出需要用到的列\n",
    "use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'device', 'city', 'age', 'gender', 'img_num', 'cat_1', 'click']\n",
    "data_new = data[use_cols]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "ce2ea472",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 由于这个data_new的数据量还是太大， 我电脑训练不动， 所以这里再进行一波抽样\n",
    "users = set(data_new['user_id'])\n",
    "sampled_users = random.sample(users, 1000)\n",
    "data_new = data_new[data_new['user_id'].isin(sampled_users)]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "df0b628c",
   "metadata": {},
   "source": [
    "## 数据预处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e380f114",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 处理img_num\n",
    "def transform(x):\n",
    "    if x == '上海':\n",
    "        return 0\n",
    "    elif isinstance(x, float):\n",
    "        return float(x)\n",
    "    else:\n",
    "        return float(eval(x))\n",
    "data_new['img_num'] = data_new['img_num'].apply(lambda x: transform(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "46f77eb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "user_id_raw = data_new[['user_id']].drop_duplicates('user_id')\n",
    "doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')\n",
    "\n",
    "# 简单数据预处理\n",
    "sparse_features = [\n",
    "    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'\n",
    "]\n",
    "dense_features = [\n",
    "    'img_num'\n",
    "]\n",
    "\n",
    "# 填充缺失值\n",
    "data_new[sparse_features] = data_new[sparse_features].fillna('-1')\n",
    "data_new[dense_features] = data_new[dense_features].fillna(0)\n",
    "\n",
    "# 归一化\n",
    "mms = MinMaxScaler(feature_range=(0, 1))\n",
    "data_new[dense_features] = mms.fit_transform(data_new[dense_features])\n",
    "\n",
    "feature_max_idx = {}\n",
    "for feat in sparse_features:\n",
    "    lbe = LabelEncoder()\n",
    "    data_new[feat] = lbe.fit_transform(data_new[feat])\n",
    "    feature_max_idx[feat] = data_new[feat].max() + 1000\n",
    "\n",
    "# 构建用户id词典和doc的id词典，方便从用户idx找到原始的id\n",
    "# user_id_enc = data[['user_id']].drop_duplicates('user_id')\n",
    "# doc_id_enc = data[['article_id']].drop_duplicates('article_id')\n",
    "# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))\n",
    "# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ba7a17dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里需要单独把duration标签处理下， 映射到0-1之间\n",
    "duration_mms= MinMaxScaler()\n",
    "data_new['duration'] = duration_mms.fit_transform(data_new['duration'].values.reshape(-1, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "ad2691ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分数据集  这里按照曝光时间划分\n",
    "train_data = data_new[data_new['expo_time'] < '2021-07-03']\n",
    "test_data = data_new[data_new['expo_time'] >= '2021-07-06']"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad199918",
   "metadata": {},
   "source": [
    "## 特征封装"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d0e5ea3c",
   "metadata": {},
   "outputs": [],
   "source": [
    "sparse_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=4) for feat in sparse_features]\n",
    "Dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "c9f538ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 划分dnn和linear特征\n",
    "dnn_features_columns = sparse_feature_columns + Dense_feature_columns\n",
    "lhuc_feature_columns = sparse_feature_columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f5355501",
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_names = get_feature_names(dnn_features_columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "c6a5f286",
   "metadata": {},
   "outputs": [],
   "source": [
    "# AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype' \n",
    "# Keras需要把输入声明为Keras张量，其他的比如numpy张量作为输入不好使\n",
    "train_model_input = {name: tf.keras.backend.constant(train_data[name]) for name in feature_names}\n",
    "test_model_input = {name: tf.keras.backend.constant(test_data[name]) for name in feature_names}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e4803b7",
   "metadata": {},
   "source": [
    "## 模型搭建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "9b0270fe",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = SharedBottom(dnn_features_columns, lhuc_feature_columns, tower_dnn_hidden_units=[], task_types=['regression', 'binary'], \n",
    "             task_names=['duration', 'click'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "df05fadd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# model.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "25236735",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 绘制模型结构  \n",
    "# GraphViz's executables not found  \n",
    "# 安装软件，配置环境变量即可  https://graphviz.org/download/\n",
    "# from tensorflow import keras\n",
    "# keras.utils.plot_model(model, to_file='./shared_bottom.png', show_shapes=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eb7b39f2",
   "metadata": {},
   "source": [
    "## 模型的训练和预测"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "4f373ebd",
   "metadata": {},
   "outputs": [],
   "source": [
    "label_duration = tf.keras.backend.constant(train_data['duration'].values)\n",
    "label_click = tf.keras.backend.constant(train_data['click'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "01764354",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 构建数据管道\n",
    "batch_size = 128\n",
    "train_ds = tf.data.Dataset.from_tensor_slices((train_model_input, (label_duration, label_click))).shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "5f7fa50a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 模型训练这里，需要用到底层的训练脚本，这里不能用高层keras的API\n",
    "optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "e25bed0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 这里为了让每个任务又可比性，还是用loss值进行评估\n",
    "train_loss = tf.keras.metrics.Mean(name='train_loss')\n",
    "train_reg_loss = tf.keras.metrics.Mean(name='train_reg_loss')\n",
    "train_bin_loss = tf.keras.metrics.Mean(name='train_bin_loss')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "9f8141c4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# loss fuc这里需要自定义focal_loss\n",
    "def focal_loss_binary(y_true, y_pred, gamma=2., alpha=.25):\n",
    "    \"\"\"\n",
    "     Multi-labels Focal loss formula:\n",
    "            FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)\n",
    "                 ,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.\n",
    "    \"\"\"\n",
    "    # 这里是过滤，对于正样本，y_true等于1的位置保留y_pred，为0的地方置为1， 因为log1=0，负样本保留为0的地方，log1-0=0\n",
    "    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))\n",
    "    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))\n",
    "    \n",
    "    return  -K.mean(alpha * K.pow(1.-pt_1, gamma) * K.log(pt_1+K.epsilon())) - K.mean((1-alpha) * K.pow(pt_0, gamma) * K.log(1.-pt_0+K.epsilon()))\n",
    "\n",
    "def focal_loss_reg(y_true, y_pred, gamma=2.):\n",
    "    \n",
    "    mse = 1 / 2 * K.pow(y_true-y_pred, 2)\n",
    "    # 保证在0-1之间\n",
    "    mse = tf.nn.sigmoid(mse)\n",
    "    return -K.mean(K.pow(1.-mse, gamma) * K.log(mse))\n",
    "\n",
    "def focal_task_weight(k, r):\n",
    "    # 保证在0-1之间\n",
    "    k = tf.nn.sigmoid(k)\n",
    "    return -K.pow(1-k, r) * K.log(k)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "e9d77f25",
   "metadata": {},
   "outputs": [],
   "source": [
    "loss_func = {\"binary\": focal_loss_binary, \"regression\": focal_loss_reg}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "ae2f0ef9",
   "metadata": {},
   "outputs": [],
   "source": [
    "@tf.function\n",
    "def train_step(features, labels, task_types, weight=[1.0, 1.0], gamma_0=1.0):\n",
    "    losses = []\n",
    "    \n",
    "    with tf.GradientTape() as tape:\n",
    "        # 遍历每个任务\n",
    "        for i, task_type in enumerate(task_types):\n",
    "            out = model(features, training=True)\n",
    "            task_loss = loss_func[task_types[i]](out[i], labels[i], gamma_0)\n",
    "\n",
    "            losses.append(weight[i] * task_loss)\n",
    "            \n",
    "        loss = tf.add_n(losses)\n",
    "        gradients = tape.gradient(loss, model.trainable_variables)\n",
    "    \n",
    "    optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
    "    train_loss.update_state(loss)\n",
    "    train_reg_loss.update_state(losses[0])\n",
    "    train_bin_loss.update_state(losses[1])\n",
    "    return loss, losses[0], losses[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "ca4d17f6",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|                                                                                            | 0/2 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n",
      " 50%|██████████████████████████████████████████                                          | 1/2 [00:19<00:19, 19.44s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 0, Loss: 3.3285491466522217 - regression_loss: 0.00014336765161715448 - binary_loss:3.3284056186676025, loss_weight: 0.3465128540992737-0.0012190319830551744\n",
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:Gradients do not exist for variables ['dense_7/kernel:0', 'click/global_bias:0'] when minimizing the loss.\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:34<00:00, 17.06s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1, Loss: 0.004154697526246309 - regression_loss: 0.0 - binary_loss:0.004154697526246309, loss_weight: 0.3465735912322998-0.34481820464134216\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "epochs = 2\n",
    "batch_nums = math.ceil(train_data.shape[0] / batch_size)\n",
    "\n",
    "task_types = [\"regression\", \"binary\"]\n",
    "task_weight = [1.0, 1.0]\n",
    "task_gamma = [1.0, 1.0]\n",
    "\n",
    "for epoch in tqdm(range(epochs)):\n",
    "    \n",
    "    train_loss.reset_states()\n",
    "    train_reg_loss.reset_states()\n",
    "    train_bin_loss.reset_states()\n",
    "\n",
    "    for feature, labels in train_ds:\n",
    "        loss, loss_reg, loss_bin = train_step(feature, labels, task_types, task_weight)\n",
    "    \n",
    "    # 更新task weight FL(kt,rt)\n",
    "    task_weight = [focal_task_weight(train_reg_loss.result(), task_gamma[0]), focal_task_weight(train_bin_loss.result(), task_gamma[1])]\n",
    "\n",
    "    template = 'Epoch {}, Loss: {} - regression_loss: {} - binary_loss:{}, loss_weight: {}-{}'\n",
    "    print(template.format(epoch, train_loss.result(), \n",
    "                          train_reg_loss.result(),\n",
    "                          train_bin_loss.result(), task_weight[0], task_weight[1]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "8d671e57",
   "metadata": {},
   "outputs": [],
   "source": [
    "pred_ans = model.predict(test_model_input, batch_size=256)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "32dc4aa3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test click AUC 0.4885\n"
     ]
    }
   ],
   "source": [
    "print(\"test click AUC\", round(roc_auc_score(test_data['click'], pred_ans[1]), 4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "6c377e25",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test duration 21146.9607\n"
     ]
    }
   ],
   "source": [
    "print(\"test duration\", round(mean_absolute_error(duration_mms.inverse_transform(test_data['duration'].values.reshape(-1, 1)), \n",
    "                                                 duration_mms.inverse_transform(pred_ans[0]).reshape(-1, 1)), 4))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f28588fa",
   "metadata": {},
   "source": [
    "这种一个回归一个分类的时候，回归这个不行直接。 不能这么玩。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
